!pip install wordcloud
import pandas as pd
import numpy as np
import scipy.stats as scs
import statsmodels.api as sm
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Load the dataset and discard the unnamed column (presumably an index that
# was saved along with the data -- confirm against how the CSV was written).
df = pd.read_csv('small_descr_clm_code.csv')
df.drop(columns='Unnamed: 0', inplace=True)
df.head()

# Merge the two text columns into a single `descr_clm` column.
# - fillna(''): a missing value in one column must not turn the whole
#   combined cell into NaN.
# - ' ' separator: without it the last word of `descr` is fused with the
#   first word of `clm`, producing a bogus token in any word count.
df['descr_clm'] = df['descr'].fillna('') + ' ' + df['clm'].fillna('')
df.drop(columns=['descr', 'clm'], inplace=True)
df['code'] = df['code'].astype('category')
df.head()

# Rows whose code is 705; used further down for a per-code word cloud.
df_705 = df[df['code'] == 705]
df_705.head()
# Domain-specific filler words to exclude from the word clouds, in addition
# to wordcloud's built-in STOPWORDS. Each word is listed once.
custom_stopword_list = [
    'system',
    'process',
    'method',
    'one',
    'may',
    'claim',
    'embodiment',
    'invention',
    'include',
    'example',
    'step',
    'figure',
    'fig',
]
# STOPWORDS is a set. set.add() inserts a single element and did not work for a list of words, so the list is merged in with `stopwords |= set(custom_stopword_list)`, which is the in-place union operator.
# NOTE: `stopwords` is the same object as wordcloud's module-level STOPWORDS
# set, so the in-place union below also extends STOPWORDS itself; any later
# use of STOPWORDS in this session sees the custom words too.
stopwords = STOPWORDS
stopwords |= set(custom_stopword_list)

text = df.descr_clm.values

# Build the corpus by joining the documents explicitly. str(ndarray) is only
# a printable representation: it keeps quote characters and escape sequences
# from each element, and NumPy abbreviates arrays above its print threshold
# with '...', which would silently drop most documents from the cloud.
# Missing values are skipped so they do not show up as the token 'nan'.
corpus = ' '.join(str(doc) for doc in text if pd.notna(doc))

# Word cloud over the whole dataset.
wordcloud = WordCloud(
    width=3000,
    height=2000,
    background_color='black',
    stopwords=stopwords,
).generate(corpus)

# Render the cloud on a black, borderless figure with the axes hidden.
fig = plt.figure(figsize=(40, 30), facecolor='k', edgecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
# Reference (word-cloud example script): https://github.com/keyonvafa/inaugural-wordclouds/blob/master/create_wordclouds.py
!pip install gensim
import io
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
import gensim
from PIL import Image
import PIL.ImageOps
import random
from wordcloud import ImageColorGenerator
# Word cloud restricted to the code-705 rows.

# Work on a copy of the stop-word set so this cell does not modify STOPWORDS.
stopwords = set(STOPWORDS)

# Flatten every document into lower-cased, whitespace-separated tokens and
# join them once. A single join avoids rebuilding an ever-growing string for
# every token, which is quadratic in the corpus size.
comment_words = ' '.join(
    token.lower()
    for val in df_705.descr_clm
    # str() guards against non-string cells (e.g. NaN read from the CSV).
    for token in str(val).split()
)

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
).generate(comment_words)

# Plot the WordCloud image.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()